#load libraries
import csv
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Load the recipe dataset. pd.read_csv already returns a DataFrame, so it is
# used directly without re-wrapping it in pd.DataFrame(...).
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('datasets/dataset1.csv', low_memory=False)

# Zero-based column positions 9 to 158 (150 columns) contain the required
# information. Missing values in those columns are replaced with 0 so the
# later scaling / PCA steps receive a matrix without NaNs.
data = df.iloc[:, 9:159].fillna(0)
# Principal Component Analysis (PCA) is a statistical technique used for
# dimensionality reduction. It transforms a large set of features or variables
# into a smaller set while preserving as much of the original information as
# possible.
# Here, we use PCA to detect outliers among all the recipes based on their
# nutritional values.
# The recipes are categorised by 6 continents - Europe, Asia, Africa,
# South (Latin) America, North America and Australia.
# Continent label of every recipe; used to colour the points in the plot.
Y = df['Continent']

# Scale the data: standardise each column of `data` before PCA.
scaled_data = preprocessing.scale(data)

# Create two principal components. fit_transform fits the model and projects
# the same data in a single call instead of a separate fit() + transform().
pca = PCA(n_components=2)
components = pca.fit_transform(scaled_data)

# Plot the projection. `components` is a 2-column array, so x=0 / y=1 select
# the first and second principal component respectively. Points are coloured
# by continent and show the recipe title on hover.
fig = px.scatter(
    components,
    x=0,
    y=1,
    color=Y,
    hover_name=df['Recipe_title'],
    title='PCA - Recipes',
)

# Save the plot. The output directory is created first so that writing the
# HTML file does not fail when 'plots/' does not exist yet.
os.makedirs('plots', exist_ok=True)
plotly.offline.plot(fig, filename='plots/pca_recipes.html')
fig.show()